Here below I extract the csv files and require the necessary libraries.
library(caret)
library(AppliedPredictiveModeling)
library(ElemStatLearn)
library(boot)
library(earth)
library(mda)
library(MASS)
library(adabag)
library(mlbench)
library(plyr)
library(party)
library(grid)
library(mvtnorm)
library(modeltools)
library(kernlab)
library(nnet)
Url<-url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv")
Url2<-url("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-testing.csv")
training<-read.csv(Url)
testing<-read.csv(Url2)
training<-training[,2:ncol(training)]
testing<-testing[,2:ncol(testing)]
First let’s look at the size of the training dataset.
summary(training)
## user_name raw_timestamp_part_1 raw_timestamp_part_2
## adelmo :3892 Min. :1.322e+09 Min. : 294
## carlitos:3112 1st Qu.:1.323e+09 1st Qu.:252912
## charles :3536 Median :1.323e+09 Median :496380
## eurico :3070 Mean :1.323e+09 Mean :500656
## jeremy :3402 3rd Qu.:1.323e+09 3rd Qu.:751891
## pedro :2610 Max. :1.323e+09 Max. :998801
##
## cvtd_timestamp new_window num_window roll_belt
## 28/11/2011 14:14: 1498 no :19216 Min. : 1.0 Min. :-28.90
## 05/12/2011 11:24: 1497 yes: 406 1st Qu.:222.0 1st Qu.: 1.10
## 30/11/2011 17:11: 1440 Median :424.0 Median :113.00
## 05/12/2011 11:25: 1425 Mean :430.6 Mean : 64.41
## 02/12/2011 14:57: 1380 3rd Qu.:644.0 3rd Qu.:123.00
## 02/12/2011 13:34: 1375 Max. :864.0 Max. :162.00
## (Other) :11007
## pitch_belt yaw_belt total_accel_belt kurtosis_roll_belt
## Min. :-55.8000 Min. :-180.00 Min. : 0.00 :19216
## 1st Qu.: 1.7600 1st Qu.: -88.30 1st Qu.: 3.00 #DIV/0! : 10
## Median : 5.2800 Median : -13.00 Median :17.00 -1.908453: 2
## Mean : 0.3053 Mean : -11.21 Mean :11.31 -0.016850: 1
## 3rd Qu.: 14.9000 3rd Qu.: 12.90 3rd Qu.:18.00 -0.021024: 1
## Max. : 60.3000 Max. : 179.00 Max. :29.00 -0.025513: 1
## (Other) : 391
## kurtosis_picth_belt kurtosis_yaw_belt skewness_roll_belt
## :19216 :19216 :19216
## #DIV/0! : 32 #DIV/0!: 406 #DIV/0! : 9
## 47.000000: 4 0.000000 : 4
## -0.150950: 3 0.422463 : 2
## -0.684748: 3 -0.003095: 1
## -1.750749: 3 -0.010002: 1
## (Other) : 361 (Other) : 389
## skewness_roll_belt.1 skewness_yaw_belt max_roll_belt max_picth_belt
## :19216 :19216 Min. :-94.300 Min. : 3.00
## #DIV/0! : 32 #DIV/0!: 406 1st Qu.:-88.000 1st Qu.: 5.00
## 0.000000 : 4 Median : -5.100 Median :18.00
## -2.156553: 3 Mean : -6.667 Mean :12.92
## -3.072669: 3 3rd Qu.: 18.500 3rd Qu.:19.00
## -6.324555: 3 Max. :180.000 Max. :30.00
## (Other) : 361 NA's :19216 NA's :19216
## max_yaw_belt min_roll_belt min_pitch_belt min_yaw_belt
## :19216 Min. :-180.00 Min. : 0.00 :19216
## -1.1 : 30 1st Qu.: -88.40 1st Qu.: 3.00 -1.1 : 30
## -1.4 : 29 Median : -7.85 Median :16.00 -1.4 : 29
## -1.2 : 26 Mean : -10.44 Mean :10.76 -1.2 : 26
## -0.9 : 24 3rd Qu.: 9.05 3rd Qu.:17.00 -0.9 : 24
## -1.3 : 22 Max. : 173.00 Max. :23.00 -1.3 : 22
## (Other): 275 NA's :19216 NA's :19216 (Other): 275
## amplitude_roll_belt amplitude_pitch_belt amplitude_yaw_belt
## Min. : 0.000 Min. : 0.000 :19216
## 1st Qu.: 0.300 1st Qu.: 1.000 #DIV/0!: 10
## Median : 1.000 Median : 1.000 0.00 : 12
## Mean : 3.769 Mean : 2.167 0.0000 : 384
## 3rd Qu.: 2.083 3rd Qu.: 2.000
## Max. :360.000 Max. :12.000
## NA's :19216 NA's :19216
## var_total_accel_belt avg_roll_belt stddev_roll_belt var_roll_belt
## Min. : 0.000 Min. :-27.40 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.100 1st Qu.: 1.10 1st Qu.: 0.200 1st Qu.: 0.000
## Median : 0.200 Median :116.35 Median : 0.400 Median : 0.100
## Mean : 0.926 Mean : 68.06 Mean : 1.337 Mean : 7.699
## 3rd Qu.: 0.300 3rd Qu.:123.38 3rd Qu.: 0.700 3rd Qu.: 0.500
## Max. :16.500 Max. :157.40 Max. :14.200 Max. :200.700
## NA's :19216 NA's :19216 NA's :19216 NA's :19216
## avg_pitch_belt stddev_pitch_belt var_pitch_belt avg_yaw_belt
## Min. :-51.400 Min. :0.000 Min. : 0.000 Min. :-138.300
## 1st Qu.: 2.025 1st Qu.:0.200 1st Qu.: 0.000 1st Qu.: -88.175
## Median : 5.200 Median :0.400 Median : 0.100 Median : -6.550
## Mean : 0.520 Mean :0.603 Mean : 0.766 Mean : -8.831
## 3rd Qu.: 15.775 3rd Qu.:0.700 3rd Qu.: 0.500 3rd Qu.: 14.125
## Max. : 59.700 Max. :4.000 Max. :16.200 Max. : 173.500
## NA's :19216 NA's :19216 NA's :19216 NA's :19216
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## Min. : 0.000 Min. : 0.000 Min. :-1.040000
## 1st Qu.: 0.100 1st Qu.: 0.010 1st Qu.:-0.030000
## Median : 0.300 Median : 0.090 Median : 0.030000
## Mean : 1.341 Mean : 107.487 Mean :-0.005592
## 3rd Qu.: 0.700 3rd Qu.: 0.475 3rd Qu.: 0.110000
## Max. :176.600 Max. :31183.240 Max. : 2.220000
## NA's :19216 NA's :19216
## gyros_belt_y gyros_belt_z accel_belt_x accel_belt_y
## Min. :-0.64000 Min. :-1.4600 Min. :-120.000 Min. :-69.00
## 1st Qu.: 0.00000 1st Qu.:-0.2000 1st Qu.: -21.000 1st Qu.: 3.00
## Median : 0.02000 Median :-0.1000 Median : -15.000 Median : 35.00
## Mean : 0.03959 Mean :-0.1305 Mean : -5.595 Mean : 30.15
## 3rd Qu.: 0.11000 3rd Qu.:-0.0200 3rd Qu.: -5.000 3rd Qu.: 61.00
## Max. : 0.64000 Max. : 1.6200 Max. : 85.000 Max. :164.00
##
## accel_belt_z magnet_belt_x magnet_belt_y magnet_belt_z
## Min. :-275.00 Min. :-52.0 Min. :354.0 Min. :-623.0
## 1st Qu.:-162.00 1st Qu.: 9.0 1st Qu.:581.0 1st Qu.:-375.0
## Median :-152.00 Median : 35.0 Median :601.0 Median :-320.0
## Mean : -72.59 Mean : 55.6 Mean :593.7 Mean :-345.5
## 3rd Qu.: 27.00 3rd Qu.: 59.0 3rd Qu.:610.0 3rd Qu.:-306.0
## Max. : 105.00 Max. :485.0 Max. :673.0 Max. : 293.0
##
## roll_arm pitch_arm yaw_arm total_accel_arm
## Min. :-180.00 Min. :-88.800 Min. :-180.0000 Min. : 1.00
## 1st Qu.: -31.77 1st Qu.:-25.900 1st Qu.: -43.1000 1st Qu.:17.00
## Median : 0.00 Median : 0.000 Median : 0.0000 Median :27.00
## Mean : 17.83 Mean : -4.612 Mean : -0.6188 Mean :25.51
## 3rd Qu.: 77.30 3rd Qu.: 11.200 3rd Qu.: 45.8750 3rd Qu.:33.00
## Max. : 180.00 Max. : 88.500 Max. : 180.0000 Max. :66.00
##
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## Min. : 0.00 Min. :-166.67 Min. : 0.000 Min. : 0.000
## 1st Qu.: 9.03 1st Qu.: -38.37 1st Qu.: 1.376 1st Qu.: 1.898
## Median : 40.61 Median : 0.00 Median : 5.702 Median : 32.517
## Mean : 53.23 Mean : 12.68 Mean : 11.201 Mean : 417.264
## 3rd Qu.: 75.62 3rd Qu.: 76.33 3rd Qu.: 14.921 3rd Qu.: 222.647
## Max. :331.70 Max. : 163.33 Max. :161.964 Max. :26232.208
## NA's :19216 NA's :19216 NA's :19216 NA's :19216
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## Min. :-81.773 Min. : 0.000 Min. : 0.000 Min. :-173.440
## 1st Qu.:-22.770 1st Qu.: 1.642 1st Qu.: 2.697 1st Qu.: -29.198
## Median : 0.000 Median : 8.133 Median : 66.146 Median : 0.000
## Mean : -4.901 Mean :10.383 Mean : 195.864 Mean : 2.359
## 3rd Qu.: 8.277 3rd Qu.:16.327 3rd Qu.: 266.576 3rd Qu.: 38.185
## Max. : 75.659 Max. :43.412 Max. :1884.565 Max. : 152.000
## NA's :19216 NA's :19216 NA's :19216 NA's :19216
## stddev_yaw_arm var_yaw_arm gyros_arm_x
## Min. : 0.000 Min. : 0.000 Min. :-6.37000
## 1st Qu.: 2.577 1st Qu.: 6.642 1st Qu.:-1.33000
## Median : 16.682 Median : 278.309 Median : 0.08000
## Mean : 22.270 Mean : 1055.933 Mean : 0.04277
## 3rd Qu.: 35.984 3rd Qu.: 1294.850 3rd Qu.: 1.57000
## Max. :177.044 Max. :31344.568 Max. : 4.87000
## NA's :19216 NA's :19216
## gyros_arm_y gyros_arm_z accel_arm_x accel_arm_y
## Min. :-3.4400 Min. :-2.3300 Min. :-404.00 Min. :-318.0
## 1st Qu.:-0.8000 1st Qu.:-0.0700 1st Qu.:-242.00 1st Qu.: -54.0
## Median :-0.2400 Median : 0.2300 Median : -44.00 Median : 14.0
## Mean :-0.2571 Mean : 0.2695 Mean : -60.24 Mean : 32.6
## 3rd Qu.: 0.1400 3rd Qu.: 0.7200 3rd Qu.: 84.00 3rd Qu.: 139.0
## Max. : 2.8400 Max. : 3.0200 Max. : 437.00 Max. : 308.0
##
## accel_arm_z magnet_arm_x magnet_arm_y magnet_arm_z
## Min. :-636.00 Min. :-584.0 Min. :-392.0 Min. :-597.0
## 1st Qu.:-143.00 1st Qu.:-300.0 1st Qu.: -9.0 1st Qu.: 131.2
## Median : -47.00 Median : 289.0 Median : 202.0 Median : 444.0
## Mean : -71.25 Mean : 191.7 Mean : 156.6 Mean : 306.5
## 3rd Qu.: 23.00 3rd Qu.: 637.0 3rd Qu.: 323.0 3rd Qu.: 545.0
## Max. : 292.00 Max. : 782.0 Max. : 583.0 Max. : 694.0
##
## kurtosis_roll_arm kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm
## :19216 :19216 :19216 :19216
## #DIV/0! : 78 #DIV/0! : 80 #DIV/0! : 11 #DIV/0! : 77
## -0.02438: 1 -0.00484: 1 0.55844 : 2 -0.00051: 1
## -0.04190: 1 -0.01311: 1 0.65132 : 2 -0.00696: 1
## -0.05051: 1 -0.02967: 1 -0.01548: 1 -0.01884: 1
## -0.05695: 1 -0.07394: 1 -0.01749: 1 -0.03359: 1
## (Other) : 324 (Other) : 322 (Other) : 389 (Other) : 325
## skewness_pitch_arm skewness_yaw_arm max_roll_arm max_picth_arm
## :19216 :19216 Min. :-73.100 Min. :-173.000
## #DIV/0! : 80 #DIV/0! : 11 1st Qu.: -0.175 1st Qu.: -1.975
## -0.00184: 1 -1.62032: 2 Median : 4.950 Median : 23.250
## -0.01185: 1 0.55053 : 2 Mean : 11.236 Mean : 35.751
## -0.01247: 1 -0.00311: 1 3rd Qu.: 26.775 3rd Qu.: 95.975
## -0.02063: 1 -0.00562: 1 Max. : 85.500 Max. : 180.000
## (Other) : 322 (Other) : 389 NA's :19216 NA's :19216
## max_yaw_arm min_roll_arm min_pitch_arm min_yaw_arm
## Min. : 4.00 Min. :-89.10 Min. :-180.00 Min. : 1.00
## 1st Qu.:29.00 1st Qu.:-41.98 1st Qu.: -72.62 1st Qu.: 8.00
## Median :34.00 Median :-22.45 Median : -33.85 Median :13.00
## Mean :35.46 Mean :-21.22 Mean : -33.92 Mean :14.66
## 3rd Qu.:41.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.:19.00
## Max. :65.00 Max. : 66.40 Max. : 152.00 Max. :38.00
## NA's :19216 NA's :19216 NA's :19216 NA's :19216
## amplitude_roll_arm amplitude_pitch_arm amplitude_yaw_arm
## Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 5.425 1st Qu.: 9.925 1st Qu.:13.00
## Median : 28.450 Median : 54.900 Median :22.00
## Mean : 32.452 Mean : 69.677 Mean :20.79
## 3rd Qu.: 50.960 3rd Qu.:115.175 3rd Qu.:28.75
## Max. :119.500 Max. :360.000 Max. :52.00
## NA's :19216 NA's :19216 NA's :19216
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## Min. :-153.71 Min. :-149.59 Min. :-150.871
## 1st Qu.: -18.49 1st Qu.: -40.89 1st Qu.: -77.644
## Median : 48.17 Median : -20.96 Median : -3.324
## Mean : 23.84 Mean : -10.78 Mean : 1.674
## 3rd Qu.: 67.61 3rd Qu.: 17.50 3rd Qu.: 79.643
## Max. : 153.55 Max. : 149.40 Max. : 154.952
##
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## :19216 :19216 :19216
## #DIV/0!: 5 -0.5464: 2 #DIV/0!: 406
## -0.2583: 2 -0.9334: 2
## -0.3705: 2 -2.0833: 2
## -0.5855: 2 -2.0851: 2
## -2.0851: 2 -2.0889: 2
## (Other): 393 (Other): 396
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## :19216 :19216 :19216
## #DIV/0!: 4 -0.2328: 2 #DIV/0!: 406
## -0.9324: 2 -0.3521: 2
## 0.1110 : 2 -0.7036: 2
## 1.0312 : 2 0.1090 : 2
## -0.0082: 1 1.0326 : 2
## (Other): 395 (Other): 396
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell min_roll_dumbbell
## Min. :-70.10 Min. :-112.90 :19216 Min. :-149.60
## 1st Qu.:-27.15 1st Qu.: -66.70 -0.6 : 20 1st Qu.: -59.67
## Median : 14.85 Median : 40.05 0.2 : 19 Median : -43.55
## Mean : 13.76 Mean : 32.75 -0.8 : 18 Mean : -41.24
## 3rd Qu.: 50.58 3rd Qu.: 133.22 -0.3 : 16 3rd Qu.: -25.20
## Max. :137.00 Max. : 155.00 -0.2 : 15 Max. : 73.20
## NA's :19216 NA's :19216 (Other): 318 NA's :19216
## min_pitch_dumbbell min_yaw_dumbbell amplitude_roll_dumbbell
## Min. :-147.00 :19216 Min. : 0.00
## 1st Qu.: -91.80 -0.6 : 20 1st Qu.: 14.97
## Median : -66.15 0.2 : 19 Median : 35.05
## Mean : -33.18 -0.8 : 18 Mean : 55.00
## 3rd Qu.: 21.20 -0.3 : 16 3rd Qu.: 81.04
## Max. : 120.90 -0.2 : 15 Max. :256.48
## NA's :19216 (Other): 318 NA's :19216
## amplitude_pitch_dumbbell amplitude_yaw_dumbbell total_accel_dumbbell
## Min. : 0.00 :19216 Min. : 0.00
## 1st Qu.: 17.06 #DIV/0!: 5 1st Qu.: 4.00
## Median : 41.73 0.00 : 401 Median :10.00
## Mean : 65.93 Mean :13.72
## 3rd Qu.: 99.55 3rd Qu.:19.00
## Max. :273.59 Max. :58.00
## NA's :19216
## var_accel_dumbbell avg_roll_dumbbell stddev_roll_dumbbell
## Min. : 0.000 Min. :-128.96 Min. : 0.000
## 1st Qu.: 0.378 1st Qu.: -12.33 1st Qu.: 4.639
## Median : 1.000 Median : 48.23 Median : 12.204
## Mean : 4.388 Mean : 23.86 Mean : 20.761
## 3rd Qu.: 3.434 3rd Qu.: 64.37 3rd Qu.: 26.356
## Max. :230.428 Max. : 125.99 Max. :123.778
## NA's :19216 NA's :19216 NA's :19216
## var_roll_dumbbell avg_pitch_dumbbell stddev_pitch_dumbbell
## Min. : 0.00 Min. :-70.73 Min. : 0.000
## 1st Qu.: 21.52 1st Qu.:-42.00 1st Qu.: 3.482
## Median : 148.95 Median :-19.91 Median : 8.089
## Mean : 1020.27 Mean :-12.33 Mean :13.147
## 3rd Qu.: 694.65 3rd Qu.: 13.21 3rd Qu.:19.238
## Max. :15321.01 Max. : 94.28 Max. :82.680
## NA's :19216 NA's :19216 NA's :19216
## var_pitch_dumbbell avg_yaw_dumbbell stddev_yaw_dumbbell
## Min. : 0.00 Min. :-117.950 Min. : 0.000
## 1st Qu.: 12.12 1st Qu.: -76.696 1st Qu.: 3.885
## Median : 65.44 Median : -4.505 Median : 10.264
## Mean : 350.31 Mean : 0.202 Mean : 16.647
## 3rd Qu.: 370.11 3rd Qu.: 71.234 3rd Qu.: 24.674
## Max. :6836.02 Max. : 134.905 Max. :107.088
## NA's :19216 NA's :19216 NA's :19216
## var_yaw_dumbbell gyros_dumbbell_x gyros_dumbbell_y
## Min. : 0.00 Min. :-204.0000 Min. :-2.10000
## 1st Qu.: 15.09 1st Qu.: -0.0300 1st Qu.:-0.14000
## Median : 105.35 Median : 0.1300 Median : 0.03000
## Mean : 589.84 Mean : 0.1611 Mean : 0.04606
## 3rd Qu.: 608.79 3rd Qu.: 0.3500 3rd Qu.: 0.21000
## Max. :11467.91 Max. : 2.2200 Max. :52.00000
## NA's :19216
## gyros_dumbbell_z accel_dumbbell_x accel_dumbbell_y accel_dumbbell_z
## Min. : -2.380 Min. :-419.00 Min. :-189.00 Min. :-334.00
## 1st Qu.: -0.310 1st Qu.: -50.00 1st Qu.: -8.00 1st Qu.:-142.00
## Median : -0.130 Median : -8.00 Median : 41.50 Median : -1.00
## Mean : -0.129 Mean : -28.62 Mean : 52.63 Mean : -38.32
## 3rd Qu.: 0.030 3rd Qu.: 11.00 3rd Qu.: 111.00 3rd Qu.: 38.00
## Max. :317.000 Max. : 235.00 Max. : 315.00 Max. : 318.00
##
## magnet_dumbbell_x magnet_dumbbell_y magnet_dumbbell_z roll_forearm
## Min. :-643.0 Min. :-3600 Min. :-262.00 Min. :-180.0000
## 1st Qu.:-535.0 1st Qu.: 231 1st Qu.: -45.00 1st Qu.: -0.7375
## Median :-479.0 Median : 311 Median : 13.00 Median : 21.7000
## Mean :-328.5 Mean : 221 Mean : 46.05 Mean : 33.8265
## 3rd Qu.:-304.0 3rd Qu.: 390 3rd Qu.: 95.00 3rd Qu.: 140.0000
## Max. : 592.0 Max. : 633 Max. : 452.00 Max. : 180.0000
##
## pitch_forearm yaw_forearm kurtosis_roll_forearm
## Min. :-72.50 Min. :-180.00 :19216
## 1st Qu.: 0.00 1st Qu.: -68.60 #DIV/0!: 84
## Median : 9.24 Median : 0.00 -0.8079: 2
## Mean : 10.71 Mean : 19.21 -0.9169: 2
## 3rd Qu.: 28.40 3rd Qu.: 110.00 -0.0227: 1
## Max. : 89.80 Max. : 180.00 -0.0359: 1
## (Other): 316
## kurtosis_picth_forearm kurtosis_yaw_forearm skewness_roll_forearm
## :19216 :19216 :19216
## #DIV/0!: 85 #DIV/0!: 406 #DIV/0!: 83
## -0.0073: 1 -0.1912: 2
## -0.0442: 1 -0.4126: 2
## -0.0489: 1 -0.0004: 1
## -0.0523: 1 -0.0013: 1
## (Other): 317 (Other): 317
## skewness_pitch_forearm skewness_yaw_forearm max_roll_forearm
## :19216 :19216 Min. :-66.60
## #DIV/0!: 85 #DIV/0!: 406 1st Qu.: 0.00
## 0.0000 : 4 Median : 26.80
## -0.6992: 2 Mean : 24.49
## -0.0113: 1 3rd Qu.: 45.95
## -0.0131: 1 Max. : 89.80
## (Other): 313 NA's :19216
## max_picth_forearm max_yaw_forearm min_roll_forearm min_pitch_forearm
## Min. :-151.00 :19216 Min. :-72.500 Min. :-180.00
## 1st Qu.: 0.00 #DIV/0!: 84 1st Qu.: -6.075 1st Qu.:-175.00
## Median : 113.00 -1.2 : 32 Median : 0.000 Median : -61.00
## Mean : 81.49 -1.3 : 31 Mean : -0.167 Mean : -57.57
## 3rd Qu.: 174.75 -1.4 : 24 3rd Qu.: 12.075 3rd Qu.: 0.00
## Max. : 180.00 -1.5 : 24 Max. : 62.100 Max. : 167.00
## NA's :19216 (Other): 211 NA's :19216 NA's :19216
## min_yaw_forearm amplitude_roll_forearm amplitude_pitch_forearm
## :19216 Min. : 0.000 Min. : 0.0
## #DIV/0!: 84 1st Qu.: 1.125 1st Qu.: 2.0
## -1.2 : 32 Median : 17.770 Median : 83.7
## -1.3 : 31 Mean : 24.653 Mean :139.1
## -1.4 : 24 3rd Qu.: 39.875 3rd Qu.:350.0
## -1.5 : 24 Max. :126.000 Max. :360.0
## (Other): 211 NA's :19216 NA's :19216
## amplitude_yaw_forearm total_accel_forearm var_accel_forearm
## :19216 Min. : 0.00 Min. : 0.000
## #DIV/0!: 84 1st Qu.: 29.00 1st Qu.: 6.759
## 0.00 : 322 Median : 36.00 Median : 21.165
## Mean : 34.72 Mean : 33.502
## 3rd Qu.: 41.00 3rd Qu.: 51.240
## Max. :108.00 Max. :172.606
## NA's :19216
## avg_roll_forearm stddev_roll_forearm var_roll_forearm
## Min. :-177.234 Min. : 0.000 Min. : 0.00
## 1st Qu.: -0.909 1st Qu.: 0.428 1st Qu.: 0.18
## Median : 11.172 Median : 8.030 Median : 64.48
## Mean : 33.165 Mean : 41.986 Mean : 5274.10
## 3rd Qu.: 107.132 3rd Qu.: 85.373 3rd Qu.: 7289.08
## Max. : 177.256 Max. :179.171 Max. :32102.24
## NA's :19216 NA's :19216 NA's :19216
## avg_pitch_forearm stddev_pitch_forearm var_pitch_forearm
## Min. :-68.17 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.336 1st Qu.: 0.113
## Median : 12.02 Median : 5.516 Median : 30.425
## Mean : 11.79 Mean : 7.977 Mean : 139.593
## 3rd Qu.: 28.48 3rd Qu.:12.866 3rd Qu.: 165.532
## Max. : 72.09 Max. :47.745 Max. :2279.617
## NA's :19216 NA's :19216 NA's :19216
## avg_yaw_forearm stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## Min. :-155.06 Min. : 0.000 Min. : 0.00 Min. :-22.000
## 1st Qu.: -26.26 1st Qu.: 0.524 1st Qu.: 0.27 1st Qu.: -0.220
## Median : 0.00 Median : 24.743 Median : 612.21 Median : 0.050
## Mean : 18.00 Mean : 44.854 Mean : 4639.85 Mean : 0.158
## 3rd Qu.: 85.79 3rd Qu.: 85.817 3rd Qu.: 7368.41 3rd Qu.: 0.560
## Max. : 169.24 Max. :197.508 Max. :39009.33 Max. : 3.970
## NA's :19216 NA's :19216 NA's :19216
## gyros_forearm_y gyros_forearm_z accel_forearm_x accel_forearm_y
## Min. : -7.02000 Min. : -8.0900 Min. :-498.00 Min. :-632.0
## 1st Qu.: -1.46000 1st Qu.: -0.1800 1st Qu.:-178.00 1st Qu.: 57.0
## Median : 0.03000 Median : 0.0800 Median : -57.00 Median : 201.0
## Mean : 0.07517 Mean : 0.1512 Mean : -61.65 Mean : 163.7
## 3rd Qu.: 1.62000 3rd Qu.: 0.4900 3rd Qu.: 76.00 3rd Qu.: 312.0
## Max. :311.00000 Max. :231.0000 Max. : 477.00 Max. : 923.0
##
## accel_forearm_z magnet_forearm_x magnet_forearm_y magnet_forearm_z
## Min. :-446.00 Min. :-1280.0 Min. :-896.0 Min. :-973.0
## 1st Qu.:-182.00 1st Qu.: -616.0 1st Qu.: 2.0 1st Qu.: 191.0
## Median : -39.00 Median : -378.0 Median : 591.0 Median : 511.0
## Mean : -55.29 Mean : -312.6 Mean : 380.1 Mean : 393.6
## 3rd Qu.: 26.00 3rd Qu.: -73.0 3rd Qu.: 737.0 3rd Qu.: 653.0
## Max. : 291.00 Max. : 672.0 Max. :1480.0 Max. :1090.0
##
## classe
## A:5580
## B:3797
## C:3422
## D:3216
## E:3607
##
##
colnames(training)
## [1] "user_name" "raw_timestamp_part_1"
## [3] "raw_timestamp_part_2" "cvtd_timestamp"
## [5] "new_window" "num_window"
## [7] "roll_belt" "pitch_belt"
## [9] "yaw_belt" "total_accel_belt"
## [11] "kurtosis_roll_belt" "kurtosis_picth_belt"
## [13] "kurtosis_yaw_belt" "skewness_roll_belt"
## [15] "skewness_roll_belt.1" "skewness_yaw_belt"
## [17] "max_roll_belt" "max_picth_belt"
## [19] "max_yaw_belt" "min_roll_belt"
## [21] "min_pitch_belt" "min_yaw_belt"
## [23] "amplitude_roll_belt" "amplitude_pitch_belt"
## [25] "amplitude_yaw_belt" "var_total_accel_belt"
## [27] "avg_roll_belt" "stddev_roll_belt"
## [29] "var_roll_belt" "avg_pitch_belt"
## [31] "stddev_pitch_belt" "var_pitch_belt"
## [33] "avg_yaw_belt" "stddev_yaw_belt"
## [35] "var_yaw_belt" "gyros_belt_x"
## [37] "gyros_belt_y" "gyros_belt_z"
## [39] "accel_belt_x" "accel_belt_y"
## [41] "accel_belt_z" "magnet_belt_x"
## [43] "magnet_belt_y" "magnet_belt_z"
## [45] "roll_arm" "pitch_arm"
## [47] "yaw_arm" "total_accel_arm"
## [49] "var_accel_arm" "avg_roll_arm"
## [51] "stddev_roll_arm" "var_roll_arm"
## [53] "avg_pitch_arm" "stddev_pitch_arm"
## [55] "var_pitch_arm" "avg_yaw_arm"
## [57] "stddev_yaw_arm" "var_yaw_arm"
## [59] "gyros_arm_x" "gyros_arm_y"
## [61] "gyros_arm_z" "accel_arm_x"
## [63] "accel_arm_y" "accel_arm_z"
## [65] "magnet_arm_x" "magnet_arm_y"
## [67] "magnet_arm_z" "kurtosis_roll_arm"
## [69] "kurtosis_picth_arm" "kurtosis_yaw_arm"
## [71] "skewness_roll_arm" "skewness_pitch_arm"
## [73] "skewness_yaw_arm" "max_roll_arm"
## [75] "max_picth_arm" "max_yaw_arm"
## [77] "min_roll_arm" "min_pitch_arm"
## [79] "min_yaw_arm" "amplitude_roll_arm"
## [81] "amplitude_pitch_arm" "amplitude_yaw_arm"
## [83] "roll_dumbbell" "pitch_dumbbell"
## [85] "yaw_dumbbell" "kurtosis_roll_dumbbell"
## [87] "kurtosis_picth_dumbbell" "kurtosis_yaw_dumbbell"
## [89] "skewness_roll_dumbbell" "skewness_pitch_dumbbell"
## [91] "skewness_yaw_dumbbell" "max_roll_dumbbell"
## [93] "max_picth_dumbbell" "max_yaw_dumbbell"
## [95] "min_roll_dumbbell" "min_pitch_dumbbell"
## [97] "min_yaw_dumbbell" "amplitude_roll_dumbbell"
## [99] "amplitude_pitch_dumbbell" "amplitude_yaw_dumbbell"
## [101] "total_accel_dumbbell" "var_accel_dumbbell"
## [103] "avg_roll_dumbbell" "stddev_roll_dumbbell"
## [105] "var_roll_dumbbell" "avg_pitch_dumbbell"
## [107] "stddev_pitch_dumbbell" "var_pitch_dumbbell"
## [109] "avg_yaw_dumbbell" "stddev_yaw_dumbbell"
## [111] "var_yaw_dumbbell" "gyros_dumbbell_x"
## [113] "gyros_dumbbell_y" "gyros_dumbbell_z"
## [115] "accel_dumbbell_x" "accel_dumbbell_y"
## [117] "accel_dumbbell_z" "magnet_dumbbell_x"
## [119] "magnet_dumbbell_y" "magnet_dumbbell_z"
## [121] "roll_forearm" "pitch_forearm"
## [123] "yaw_forearm" "kurtosis_roll_forearm"
## [125] "kurtosis_picth_forearm" "kurtosis_yaw_forearm"
## [127] "skewness_roll_forearm" "skewness_pitch_forearm"
## [129] "skewness_yaw_forearm" "max_roll_forearm"
## [131] "max_picth_forearm" "max_yaw_forearm"
## [133] "min_roll_forearm" "min_pitch_forearm"
## [135] "min_yaw_forearm" "amplitude_roll_forearm"
## [137] "amplitude_pitch_forearm" "amplitude_yaw_forearm"
## [139] "total_accel_forearm" "var_accel_forearm"
## [141] "avg_roll_forearm" "stddev_roll_forearm"
## [143] "var_roll_forearm" "avg_pitch_forearm"
## [145] "stddev_pitch_forearm" "var_pitch_forearm"
## [147] "avg_yaw_forearm" "stddev_yaw_forearm"
## [149] "var_yaw_forearm" "gyros_forearm_x"
## [151] "gyros_forearm_y" "gyros_forearm_z"
## [153] "accel_forearm_x" "accel_forearm_y"
## [155] "accel_forearm_z" "magnet_forearm_x"
## [157] "magnet_forearm_y" "magnet_forearm_z"
## [159] "classe"
colnames(testing)
## [1] "user_name" "raw_timestamp_part_1"
## [3] "raw_timestamp_part_2" "cvtd_timestamp"
## [5] "new_window" "num_window"
## [7] "roll_belt" "pitch_belt"
## [9] "yaw_belt" "total_accel_belt"
## [11] "kurtosis_roll_belt" "kurtosis_picth_belt"
## [13] "kurtosis_yaw_belt" "skewness_roll_belt"
## [15] "skewness_roll_belt.1" "skewness_yaw_belt"
## [17] "max_roll_belt" "max_picth_belt"
## [19] "max_yaw_belt" "min_roll_belt"
## [21] "min_pitch_belt" "min_yaw_belt"
## [23] "amplitude_roll_belt" "amplitude_pitch_belt"
## [25] "amplitude_yaw_belt" "var_total_accel_belt"
## [27] "avg_roll_belt" "stddev_roll_belt"
## [29] "var_roll_belt" "avg_pitch_belt"
## [31] "stddev_pitch_belt" "var_pitch_belt"
## [33] "avg_yaw_belt" "stddev_yaw_belt"
## [35] "var_yaw_belt" "gyros_belt_x"
## [37] "gyros_belt_y" "gyros_belt_z"
## [39] "accel_belt_x" "accel_belt_y"
## [41] "accel_belt_z" "magnet_belt_x"
## [43] "magnet_belt_y" "magnet_belt_z"
## [45] "roll_arm" "pitch_arm"
## [47] "yaw_arm" "total_accel_arm"
## [49] "var_accel_arm" "avg_roll_arm"
## [51] "stddev_roll_arm" "var_roll_arm"
## [53] "avg_pitch_arm" "stddev_pitch_arm"
## [55] "var_pitch_arm" "avg_yaw_arm"
## [57] "stddev_yaw_arm" "var_yaw_arm"
## [59] "gyros_arm_x" "gyros_arm_y"
## [61] "gyros_arm_z" "accel_arm_x"
## [63] "accel_arm_y" "accel_arm_z"
## [65] "magnet_arm_x" "magnet_arm_y"
## [67] "magnet_arm_z" "kurtosis_roll_arm"
## [69] "kurtosis_picth_arm" "kurtosis_yaw_arm"
## [71] "skewness_roll_arm" "skewness_pitch_arm"
## [73] "skewness_yaw_arm" "max_roll_arm"
## [75] "max_picth_arm" "max_yaw_arm"
## [77] "min_roll_arm" "min_pitch_arm"
## [79] "min_yaw_arm" "amplitude_roll_arm"
## [81] "amplitude_pitch_arm" "amplitude_yaw_arm"
## [83] "roll_dumbbell" "pitch_dumbbell"
## [85] "yaw_dumbbell" "kurtosis_roll_dumbbell"
## [87] "kurtosis_picth_dumbbell" "kurtosis_yaw_dumbbell"
## [89] "skewness_roll_dumbbell" "skewness_pitch_dumbbell"
## [91] "skewness_yaw_dumbbell" "max_roll_dumbbell"
## [93] "max_picth_dumbbell" "max_yaw_dumbbell"
## [95] "min_roll_dumbbell" "min_pitch_dumbbell"
## [97] "min_yaw_dumbbell" "amplitude_roll_dumbbell"
## [99] "amplitude_pitch_dumbbell" "amplitude_yaw_dumbbell"
## [101] "total_accel_dumbbell" "var_accel_dumbbell"
## [103] "avg_roll_dumbbell" "stddev_roll_dumbbell"
## [105] "var_roll_dumbbell" "avg_pitch_dumbbell"
## [107] "stddev_pitch_dumbbell" "var_pitch_dumbbell"
## [109] "avg_yaw_dumbbell" "stddev_yaw_dumbbell"
## [111] "var_yaw_dumbbell" "gyros_dumbbell_x"
## [113] "gyros_dumbbell_y" "gyros_dumbbell_z"
## [115] "accel_dumbbell_x" "accel_dumbbell_y"
## [117] "accel_dumbbell_z" "magnet_dumbbell_x"
## [119] "magnet_dumbbell_y" "magnet_dumbbell_z"
## [121] "roll_forearm" "pitch_forearm"
## [123] "yaw_forearm" "kurtosis_roll_forearm"
## [125] "kurtosis_picth_forearm" "kurtosis_yaw_forearm"
## [127] "skewness_roll_forearm" "skewness_pitch_forearm"
## [129] "skewness_yaw_forearm" "max_roll_forearm"
## [131] "max_picth_forearm" "max_yaw_forearm"
## [133] "min_roll_forearm" "min_pitch_forearm"
## [135] "min_yaw_forearm" "amplitude_roll_forearm"
## [137] "amplitude_pitch_forearm" "amplitude_yaw_forearm"
## [139] "total_accel_forearm" "var_accel_forearm"
## [141] "avg_roll_forearm" "stddev_roll_forearm"
## [143] "var_roll_forearm" "avg_pitch_forearm"
## [145] "stddev_pitch_forearm" "var_pitch_forearm"
## [147] "avg_yaw_forearm" "stddev_yaw_forearm"
## [149] "var_yaw_forearm" "gyros_forearm_x"
## [151] "gyros_forearm_y" "gyros_forearm_z"
## [153] "accel_forearm_x" "accel_forearm_y"
## [155] "accel_forearm_z" "magnet_forearm_x"
## [157] "magnet_forearm_y" "magnet_forearm_z"
## [159] "problem_id"
dim(training)
## [1] 19622 159
table(training$user_name)
##
## adelmo carlitos charles eurico jeremy pedro
## 3892 3112 3536 3070 3402 2610
So the training set has 19622 observations and 159 variables, with around 3000 observations per subject. Below I divide the variables according to body parts and look at the distribution of each of the the numeric variables. I take out all those variables that have missing values for more than 90% of the times. Furthermore, I divide training into 2 parts, one called to train my predictive model with 70% of the observations and the other to test it.
for(i in 2:ncol(training)){
training[which(training[,i]=="#DIV/0!"),i]<-NA
training[which(training[,i]==""),i]<-NA
}
too_na_cols<-NULL
for(i in 1:ncol(training)){
if (sum(is.na(training[,i]))/nrow(training)>.9) too_na_cols<-c(too_na_cols, i)
}
length(too_na_cols)
## [1] 100
training<-training[,-too_na_cols]
ncol(training)
## [1] 59
numeric_cols<-NULL
for(i in 1:ncol(training)){
if (class(training[,i])=="numeric" | class(training[,i])=="integer") numeric_cols<-c(numeric_cols, i)
}
length(numeric_cols)
## [1] 55
set.seed(1234)
inTrain<-createDataPartition(y=training$classe, p=.7, list=F)
train_train<-training[inTrain, ]
test_train<-training[-inTrain, ]
arm_var<-colnames(train_train)[grep(pattern="_arm", x=(colnames(train_train)))]
dumbbell_var<-colnames(train_train)[grep(pattern="_dumbbell", x=(colnames(train_train)))]
belt_var<-colnames(train_train)[grep(pattern="_belt", x=(colnames(train_train)))]
forearm_var<-colnames(train_train)[grep(pattern="_forearm", x=(colnames(train_train)))]
nrow(train_train)
## [1] 13737
par(mfrow=c(3,2))
for(i in numeric_cols[1:(length(numeric_cols)-1)]){
ord<-which(numeric_cols==i)
col<-colnames(train_train)[numeric_cols[ord]]
for(class in levels(train_train$classe)){
hist(train_train[which(train_train$classe==class),col], main=paste(col,"in class", class), xlab=paste(class))
}
plot.new()
}
I turned out that up to 100 were for 90% of the observations with not available data. I therefore preferred to kick them out of the training dataset. Checking for collinearity among the left variables might help.
mat<-matrix(ncol=length(numeric_cols), nrow=length(numeric_cols))
err<-matrix(ncol=length(numeric_cols), nrow=length(numeric_cols))
diag(mat)<-0
for(i in numeric_cols){
for(j in setdiff(numeric_cols, i) ){
ord1<-which(numeric_cols==i)
ord2<-which(numeric_cols==j)
mat[ord1,ord2]<-cor(as.numeric(train_train[,i]), as.numeric(train_train[,j]), use="pairwise.complete.obs")
if (mat[ord1, ord2]>=.7) err[ord1,ord2]<-1
}
}
There are therefore 22 pairs of variables that are collinear, but in prediction models this is not an issue. Anyway, for computational time purposes, it is still better to perform principal component analysis on the training subset and then run some classification predictive methods, like classification tree here below.
prePROC<-preProcess(x=train_train, method="pca", thresh = .90)
prePROC
## Created from 13737 samples and 59 variables
##
## Pre-processing:
## - centered (55)
## - ignored (4)
## - principal component signal extraction (55)
## - scaled (55)
##
## PCA needed 20 components to capture 90 percent of the variance
trainPC<-predict(prePROC, train_train)
rownames(train_train)<-1:nrow(train_train)
testPC<-predict(prePROC, test_train[, ])
modFit1<-train(classe ~ . , method="ctree", data=trainPC)
pred1<-predict(modFit1, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred1)$table
accuracy1<-sum(diag(tab))/sum(tab)
accuracy1
## [1] 0.9043331
Below here I try a predictor with PCA + linear discriminant analysis.
modFit2<-train(classe ~ ., method="lda", data=trainPC)
pred2<-predict(modFit2, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred2)$table
accuracy2<-sum(diag(tab))/sum(tab)
accuracy2
## [1] 0.7296517
Below here I try a predictor with linear discriminant analysis.
modFit3<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="rpart", data=trainPC)
pred3<-predict(modFit3, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred3)$table
accuracy3<-sum(diag(tab))/sum(tab)
accuracy3 #too low, so I won't consider it later for the voting stacking.
## [1] 0.3687341
Below here I try a predictor with flexible discriminant analysis.
modFit4<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="fda", data=trainPC)
pred4<-predict(modFit4, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred4)$table
accuracy4<-sum(diag(tab))/sum(tab)
accuracy4
## [1] 0.542226
Below here I try a predictor with amdai.
modFit5<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="amdai", data=trainPC, na.action = na.omit)
pred5<-predict(modFit5, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred5)$table
accuracy5<-sum(diag(tab))/sum(tab)
accuracy5
## [1] 0.5252336
Below here I tried a predictor with deepboost and it doesn’t work.
#modFit6<-train(classe ~ ., method="deepboost", data=trainPC)
#modFit6<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="deepboost", data=trainPC)
Below here I try a predictor with PCA+ quadratic discriminative analysis.
modFit6<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="qda", data=trainPC)
pred6<-predict(modFit6, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred6)$table
accuracy6<-sum(diag(tab))/sum(tab)
accuracy6 #too bad for an accuracy.
## [1] 0.6853016
Below here I try a predictor with PCA+ robust linear discriminant analysis.
modFit7<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="Linda", data=trainPC)
pred7<-predict(modFit7, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred7)$table
accuracy7<-sum(diag(tab))/sum(tab)
accuracy7
## [1] 0.5119796
Below here I try a predictor with PCA+ heteroscedastic discriminant analysis.
modFit8<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="hda", data=trainPC, na.action=na.omit, verbose=F, verboseIter = FALSE)
pred8<-predict(modFit8, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred8)$table
accuracy8<-sum(diag(tab))/sum(tab)
Such method gives a 0.3969414 accuracy, which is too low to be accepted.
Below here I try a predictor with PCA+ high dimensional discriminant analysis.
modFit9<-train(classe ~ PC1+PC2+PC3+PC4 +PC5+PC6 + PC7 +PC8+PC9+PC10+PC11+PC12+PC13+PC14+PC15+PC16 +PC17+PC18+PC19+PC20, method="hdda", data=trainPC, na.action=na.omit)
pred9<-predict(modFit9, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred9)$table
accuracy9<-sum(diag(tab))/sum(tab)
Such method gives a 0.6742566 accuracy, which is enough to be accepted.
Below here I try a predictor with PCA+ least square support vector machine.
modFit10<-train(classe ~ ., method="lssvmLinear", data=trainPC, verbose=F)
pred10<-predict(modFit10, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred10)$table
accuracy10<-sum(diag(tab))/sum(tab)
accuracy10
Below here I try a predictor with PCA+ penalized discriminant analysis.
modFit11<-train(classe ~ ., method="pda", data=trainPC, verbose=F, verboseIter = FALSE)
pred11<-predict(modFit11, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred11)$table
accuracy11<-sum(diag(tab))/sum(tab)
accuracy11
## [1] 0.7294817
Below here I try a predictor with PCA+ penalized multinomial regression.
modFit12<-train(classe ~ ., method="multinom", data=trainPC, verbose=F, verboseIter = FALSE)
pred12<-predict(modFit12, newdata=testPC, method="glm")
tab<-confusionMatrix(test_train$classe, pred12)$table
accuracy12<-sum(diag(tab))/sum(tab)
accuracy12
Such method gives a 0.7765506 accuracy, which is enough to be accepted.
Below here I combine the predictors with accuracy>=0.5 through a random forest. The accuracy should be higher than any other predictor used so far.
predictors<-t(rbind(pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11, pred12))
dim(predictors)
## [1] 5885 12
accuracies<-c(accuracy1,accuracy2, accuracy3, accuracy4, accuracy5, accuracy6, accuracy7, accuracy8, accuracy9, accuracy10,accuracy11, accuracy12)
good_predictors<-predictors[,which(accuracies>=0.5)]
predDF<-data.frame(good_predictors, classe=test_train$classe)
combModFit<-train(classe~., method="rf", data=predDF)
print(combModFit)
## Random Forest
##
## 5885 samples
## 9 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5885, 5885, 5885, 5885, 5885, 5885, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9179486 0.8960442
## 5 0.9165510 0.8943051
## 9 0.9126391 0.8893575
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
combPred<-predict(combModFit, predDF)
tab<-confusionMatrix(reference=test_train$classe, combPred)$table
acc_comb<-sum(diag(tab))/sum(tab)
acc_comb
## [1] 0.9423959
As expected the accuracy of the combined predictor is higher than any other predictor used to build it.
Now I will apply the same process on the testing dataset: principal component analysis, compute those predictors that proved enough accurate on training, combine them with the random forest process used above (combModFit).
for(i in 2:ncol(testing)){
testing[which(testing[,i]=="#DIV/0!"),i]<-NA
testing[which(testing[,i]==""),i]<-NA
}
#compute pca on testing
PCtest<-predict(prePROC, testing[, ])
head(PCtest)
## user_name cvtd_timestamp new_window kurtosis_roll_belt
## 1 pedro 05/12/2011 14:23 no NA
## 2 jeremy 30/11/2011 17:11 no NA
## 3 jeremy 30/11/2011 17:11 no NA
## 4 adelmo 02/12/2011 13:33 no NA
## 5 eurico 28/11/2011 14:13 no NA
## 6 jeremy 30/11/2011 17:12 no NA
## kurtosis_picth_belt kurtosis_yaw_belt skewness_roll_belt
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## skewness_roll_belt.1 skewness_yaw_belt max_roll_belt max_picth_belt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## max_yaw_belt min_roll_belt min_pitch_belt min_yaw_belt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## amplitude_roll_belt amplitude_pitch_belt amplitude_yaw_belt
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## var_total_accel_belt avg_roll_belt stddev_roll_belt var_roll_belt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## avg_pitch_belt stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## stddev_yaw_belt var_yaw_belt var_accel_arm avg_roll_arm stddev_roll_arm
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## var_roll_arm avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## stddev_yaw_arm var_yaw_arm kurtosis_roll_arm kurtosis_picth_arm
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm skewness_yaw_arm
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## max_roll_arm max_picth_arm max_yaw_arm min_roll_arm min_pitch_arm
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## min_yaw_arm amplitude_roll_arm amplitude_pitch_arm amplitude_yaw_arm
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell min_roll_dumbbell
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## min_pitch_dumbbell min_yaw_dumbbell amplitude_roll_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## amplitude_pitch_dumbbell amplitude_yaw_dumbbell var_accel_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## avg_roll_dumbbell stddev_roll_dumbbell var_roll_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## avg_pitch_dumbbell stddev_pitch_dumbbell var_pitch_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## avg_yaw_dumbbell stddev_yaw_dumbbell var_yaw_dumbbell
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## max_roll_forearm max_picth_forearm max_yaw_forearm min_roll_forearm
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## min_pitch_forearm min_yaw_forearm amplitude_roll_forearm
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## amplitude_pitch_forearm amplitude_yaw_forearm var_accel_forearm
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## avg_roll_forearm stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## stddev_yaw_forearm var_yaw_forearm problem_id PC1 PC2
## 1 NA NA 1 -4.8456136 3.12307556
## 2 NA NA 2 4.8325429 0.09568837
## 3 NA NA 3 4.9319268 -0.61977056
## 4 NA NA 4 0.3097546 -5.63180259
## 5 NA NA 5 2.4552786 3.31247029
## 6 NA NA 6 4.1771304 0.08324963
## PC3 PC4 PC5 PC6 PC7 PC8
## 1 -3.609626 -0.23310502 0.5203761 1.7882487 1.04564336 -1.1006942
## 2 -3.540289 -0.04832414 -1.0431976 0.2888390 0.75565062 0.6672060
## 3 -2.646099 0.47831746 -0.5409476 -0.1039543 -1.93115888 1.0503413
## 4 -1.564981 0.37532823 1.4487980 0.5031426 0.28390174 0.1439488
## 5 1.533715 -0.34382547 3.7858437 -1.6879175 2.13258155 2.0917799
## 6 -1.411519 2.32656964 -1.7941229 0.5766115 -0.08239423 -0.8452897
## PC9 PC10 PC11 PC12 PC13 PC14
## 1 -0.9037287 0.1034248 0.07920694 0.2580630 -0.2409820 -1.6868410
## 2 -0.4821630 -0.7639900 -0.50440237 0.6149592 0.1489270 -0.7069487
## 3 1.4667181 0.7299646 -0.30651883 0.3091913 -0.1086915 0.4068513
## 4 0.4021018 -1.1168459 -0.45715633 -0.9507718 0.8335725 0.1710266
## 5 -0.6674269 1.2656821 0.58817387 -1.1376817 -0.2618417 -0.8967499
## 6 -1.1338901 0.8004490 0.98608769 1.1225023 0.5345465 0.3983235
## PC15 PC16 PC17 PC18 PC19 PC20
## 1 0.6200221 -0.01340708 0.7540211 2.008930580 -0.96516703 0.7912222
## 2 -0.4056577 -0.31936277 0.7543794 0.498145055 0.07829683 0.9641596
## 3 0.5043399 0.14763325 0.2950655 0.221501560 -0.32673788 0.1192217
## 4 -0.4643979 0.13532929 -1.6080485 -0.240557936 0.60873608 0.4861115
## 5 -0.9360925 -0.09642213 -0.5589761 0.685540455 0.09769428 -1.4680582
## 6 -0.7036718 -0.22307272 -0.5923932 -0.002810166 -0.08646992 -0.7136272
#calculate the different predictors (thos ewith accuracy >=0.5) on the pre-processed dataframe PCtest
predictors_test<-cbind(predict(modFit1, newdata=PCtest), predict(modFit2, newdata=PCtest), predict(modFit4, newdata=PCtest), predict(modFit5, newdata=PCtest),predict(modFit6, newdata=PCtest),predict(modFit7, newdata=PCtest), predict(modFit9, newdata=PCtest), predict(modFit11, newdata=PCtest), predict(modFit12, newdata=PCtest))
dim(predictors_test)
## [1] 20 9
colnames(predictors_test)<-c("pred1", "pred2","pred4","pred5","pred6","pred7","pred9","pred11","pred12")
head(predictors_test)
## pred1 pred2 pred4 pred5 pred6 pred7 pred9 pred11 pred12
## [1,] 2 2 1 4 3 5 3 2 3
## [2,] 1 3 1 1 1 1 1 3 3
## [3,] 1 3 1 1 3 1 3 3 3
## [4,] 1 1 1 1 1 1 1 1 1
## [5,] 1 1 1 1 3 5 3 1 1
## [6,] 5 5 3 3 1 4 1 5 5
predDF_test<-data.frame(predictors_test)
combPred<-predict(combModFit, predDF_test)
The predicted levels of the testing dataset are therefore
B, A, A, A, A, E, D, B, A, A, B, C, B, A, E, E, A, A, A, B